##Project Title: “Dynamic Legitimation Theory and Populist Authoritarian Leaders:
##                The Rhetorical Framing of International Status for Domestic Legitimacy”
##Author(s): John C. Stanko
##Affiliation: Indiana University, Bloomington
##Journal: Political Communication (conditionally accepted)
##Date of Last Edit: May 30, 2025


####
##Install and load needed R packages

#List of required packages (installed and attached below)
package_list <- c("dplyr", "flextable", "ggplot2", "officer", "quanteda", "quanteda.textstats",
                  "quanteda.textplots", "readxl", "readtext", "stm", "textTools", "tidyr")

#Install only the packages that are not already available, so that re-running the
#script does not re-download every dependency each time
is_installed <- vapply(package_list, requireNamespace, logical(1), quietly = TRUE)
missing_packages <- package_list[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

#Load required packages (in the order listed above)
#invisible() suppresses the list that lapply() would otherwise print at top level
invisible(lapply(package_list, library, character.only = TRUE))


####
###Read the state of the nation addresses from disk
##Update corpus_dir as needed; it should point to the folder where the speeches are saved in TXT format.
corpus_dir <- "./JCStanko_PolComm_Populist_Authoritarian_Leaders_Speech_Corpus"

#Document variables are taken from each file name and labelled in this order
filename_fields <- c("leader", "iso_a2", "type", "year")

speeches <- readtext(file = corpus_dir,
                     docvarsfrom = "filenames",
                     docvarnames = filename_fields)
head(speeches)


####
##Build the corpus object used in the rest of the script

#Turn the imported texts into a quanteda corpus
speech_corpus <- corpus(speeches)

#Number of documents (author's note: N: 22 and O: 12 and Z: 8)
ndoc(speech_corpus)

#Keep the document-level metadata in its own data frame
dvars <- docvars(speech_corpus)

#Print an overview of the corpus contents
summary(speech_corpus)


####
##Split the full corpus into one sub-corpus per leader (matched on the "leader" document variable)

#Subset the corpus: Nursultan Nazarbayev, Viktor Orban, Jacob Zuma
nazarbayev_subcorpus <- corpus_subset(speech_corpus, subset = leader == "Nazarbayev")
orban_subcorpus <- corpus_subset(speech_corpus, subset = leader == "Orbán")
zuma_subcorpus <- corpus_subset(speech_corpus, subset = leader == "Zuma")

#Store each sub-corpus's document variables separately
naz_dvars <- docvars(nazarbayev_subcorpus)
orb_dvars <- docvars(orban_subcorpus)
zum_dvars <- docvars(zuma_subcorpus)

#Print an overview of each sub-corpus
summary(nazarbayev_subcorpus)
summary(orban_subcorpus)
summary(zuma_subcorpus)


####
##Tokenize each sub-corpus into words, dropping punctuation and numbers

#Nazarbayev tokens
nazarbayev_toks <- tokens(nazarbayev_subcorpus, remove_punct = TRUE, remove_numbers = TRUE)

#Orban tokens
orban_toks <- tokens(orban_subcorpus, remove_punct = TRUE, remove_numbers = TRUE)

#Zuma tokens
zuma_toks <- tokens(zuma_subcorpus, remove_punct = TRUE, remove_numbers = TRUE)


####
##Build the list of common stopwords (such as grammatical articles) to strip from the tokens

#Author's note: the stopword database comes from the textTools package
#NOTE(review): quanteda also provides a stopwords() function; confirm which one is attached last
stoppers <- stopwords()
head(stoppers)

####
##Create stopword-free ("minimal") versions of the three token sets
#padding = TRUE is passed so that removed tokens are padded rather than closed up

#Streamlined Nazarbayev tokens
nazarbayev_toks_minimal <- tokens_remove(nazarbayev_toks, pattern = stoppers, padding = TRUE)

#Streamlined Orban tokens
orban_toks_minimal <- tokens_remove(orban_toks, pattern = stoppers, padding = TRUE)

#Streamlined Zuma tokens
zuma_toks_minimal <- tokens_remove(zuma_toks, pattern = stoppers, padding = TRUE)


#######

##NOT USED IN THE FINAL ANALYSIS##
##Exploratory look at three-word collocations in the stopword-free tokens
##(case is left untouched via tolower = FALSE)

#Three-word collocations: Nazarbayev
nazarbayev_coll3 <- textstat_collocations(nazarbayev_toks_minimal, size = 3, tolower = FALSE)

#Three-word collocations: Orban
orban_coll3 <- textstat_collocations(orban_toks_minimal, size = 3, tolower = FALSE)

#Three-word collocations: Zuma
zuma_coll3 <- textstat_collocations(zuma_toks_minimal, size = 3, tolower = FALSE)


#######

###Use the keywords-in-context function to identify usages of the project's 13 dictionary terms

##The status dictionary, defined once.
##Names are the labels used in the result object names (kw_<label>_<leader>);
##values are the wildcard patterns handed to kwic().
status_patterns <- c(
  appreciate  = "appreciat*",   #"appreciation"
  acknowledge = "acknowledg*",  #"acknowledgment"
  distinct    = "distin*",      #"distinction"
  influence   = "influen*",     #"influence"
  lead        = "leader*",      #"leader"
  place       = "place*",       #"place"
  position    = "position*",    #"position"
  prestige    = "prestig*",     #"prestige"
  rank        = "rank*",        #"rank"
  recognition = "recogni*",     #"recognition"
  respect     = "respect*",     #"respect"
  role        = "role*",        #"role"
  status      = "status*"       #"status"
)

##The token sets to search (these still contain stopwords, so the context reads naturally)
leader_toks <- list(nazarbayev = nazarbayev_toks, orban = orban_toks, zuma = zuma_toks)

##Number of tokens of context kept on each side of a match
kwic_window <- 25

##Search every term in every leader's speeches.
##Each result is stored under the same name the script has always used,
##e.g. kw_appreciate_nazarbayev, kw_lead_orban, kw_status_zuma.
for (term_label in names(status_patterns)) {
  for (leader_label in names(leader_toks)) {
    kw_name <- paste("kw", term_label, leader_label, sep = "_")
    kw_hits <- kwic(leader_toks[[leader_label]],
                    pattern = status_patterns[[term_label]],
                    window = kwic_window)
    assign(kw_name, kw_hits)

    #Values are not auto-printed inside a loop, so show the first matches explicitly
    cat("\n", kw_name, "\n", sep = "")
    print(head(kw_hits))
  }
}


#######

###Prepare data for the structural topic modelling validation check
##Build one document-feature matrix per leader from the stopword-free tokens
#NOTE(review): those tokens were created with padding = TRUE; confirm whether the padding
#token should be kept as a feature here (see dfm()'s remove_padding argument)

#Nazarbayev
corpdfm_nazarbayev <- dfm(x = nazarbayev_toks_minimal)

#Orban
corpdfm_orban <- dfm(x = orban_toks_minimal)

#Zuma
corpdfm_zuma <- dfm(x = zuma_toks_minimal)


#######

##NOT USED IN THE ANALYSIS##
##Exploratory look at the top 40 words used in a given speech; here, Zuma's 2013 address
##Not central to the analysis, but can be changed, if interested
sub_corp <- dfm_subset(corpdfm_zuma, year == 2013)
dim(sub_corp)

#topfeatures() ranks features by their count in the subset; featnames()[1:40] would
#only list the first 40 feature names in storage order, not the most frequent ones
topfeatures(sub_corp, n = 40)


#######

###Structural Topic Modelling Analysis
##Determine an appropriate number of clusters for the STM models

##First, create a streamlined version of the full corpus
##(corpdfm_corpus is not referenced again in this script)
corpus_toks <- tokens(speech_corpus, remove_numbers = TRUE, remove_punct = TRUE)
corpus_toks_minimal <- tokens_remove(corpus_toks, stoppers, padding = TRUE)
corpdfm_corpus <- dfm(corpus_toks_minimal)

##Second, use the searchK function from the stm package to identify the best number of clusters

#Format the sub-corpora for STM analysis
out_naz <- convert(corpdfm_nazarbayev, to = "stm")
out_orban <- convert(corpdfm_orban, to = "stm")
out_zuma <- convert(corpdfm_zuma, to = "stm")

#Candidate numbers of clusters and the held-out seed, shared by all three searches
candidate_k <- c(10, 15, 20, 25, 30)
searchk_seed <- 383

#Run searchK on the Nazarbayev corpus
k_compare_naz <- searchK(out_naz$documents, out_naz$vocab, K = candidate_k, heldout.seed = searchk_seed)
plot(k_compare_naz)
###Author's finding: k = 10 has the highest held-out likelihood, the lowest residuals, and the highest semantic coherence.

#Run searchK on the Orban corpus
k_compare_orban <- searchK(out_orban$documents, out_orban$vocab, K = candidate_k, heldout.seed = searchk_seed)
plot(k_compare_orban)
###Author's finding: k = 10 again has the highest held-out likelihood, the lowest residuals, and the highest semantic coherence.

#Run searchK on the Zuma corpus
###Author's note: Zuma's corpus does not have enough documents for the model to converge.
#tryCatch() keeps an error in this call from halting the rest of the script when it is
#run non-interactively; k_compare_zuma is NULL if the search fails
k_compare_zuma <- tryCatch(
  searchK(out_zuma$documents, out_zuma$vocab, K = candidate_k, heldout.seed = searchk_seed),
  error = function(e) {
    message("searchK could not be completed for the Zuma corpus: ", conditionMessage(e))
    NULL
  }
)
##Given that 10 was the optimal k for the other 2 corpora, I use that as the number of clusters moving forward.


####
##Fit one structural topic model per leader

#Settings shared by the three models: 10 clusters and a fixed seed
n_topics <- 10
stm_seed <- 383

#Nazarbayev
mod_nazarbayev <- stm(documents = corpdfm_nazarbayev, K = n_topics, seed = stm_seed)

#Orban
mod_orban <- stm(documents = corpdfm_orban, K = n_topics, seed = stm_seed)

#Zuma
mod_zuma <- stm(documents = corpdfm_zuma, K = n_topics, seed = stm_seed)


####
##Plot the topic labels for each fitted model
#labeltype is set to "frex" here; swap it to use a different labelling

#Nazarbayev
plot(mod_nazarbayev, labeltype = "frex", type = "labels")

#Orban
plot(mod_orban, labeltype = "frex", type = "labels")

#Zuma
plot(mod_zuma, labeltype = "frex", type = "labels")



#######
##NOT USED IN THE PUBLISHED ANALYSIS##
##Optional: the code below lets researchers examine the most and least
##popular phrases by year (not restricted to the 13 terms used herein).

#Group each leader's document-feature matrix by the "year" document variable
dfm_nazarbayev_year <- dfm_group(corpdfm_nazarbayev, groups = year)
dfm_orban_year <- dfm_group(corpdfm_orban, groups = year)
dfm_zuma_year <- dfm_group(corpdfm_zuma, groups = year)


##Example: "keyness" of terms in Orban's 2017 speech, with "2017" as the target document
terms.2017 <- textstat_keyness(dfm_orban_year, target = "2017")
textplot_keyness(terms.2017)


#######
##Reproducing Table 1 and Figure 1
#The workbook read below was compiled manually during the course of the qualitative content analysis.
#The kwic results were pared down to only those excerpts which specifically refer to international status markers.

##Read each leader's sheet of international status-related excerpts
qca_workbook <- "./PolComm_Populist_Authoritarian_Leaders_Content_Analysis_Workbook.xlsx"
qca_naz <- read_xlsx(path = qca_workbook, sheet = "Nazarbayev")
qca_orban <- read_xlsx(path = qca_workbook, sheet = "Orban")
qca_zuma <- read_xlsx(path = qca_workbook, sheet = "Zuma")

#Nazarbayev: make Year numeric, then keep only the rows that have a year
qca_naz$Year <- as.numeric(qca_naz$Year)
qca_naz_compact <- qca_naz[!is.na(qca_naz$Year), ]

#Orbán: make Year numeric, then keep only the rows that have a year
qca_orban$Year <- as.numeric(qca_orban$Year)
qca_orban_compact <- qca_orban[!is.na(qca_orban$Year), ]

#Zuma: make Year numeric, then keep only the rows that have a year
qca_zuma$Year <- as.numeric(qca_zuma$Year)
qca_zuma_compact <- qca_zuma[!is.na(qca_zuma$Year), ]


####
##Reproducing Table 1 from the article
#Set up an empty table: one row per status term, one pair of count columns per leader

#The 13 status terms, in the order they appear in the table
term_vars <- c("Appreciation", "Acknowledgment", "Distinction", "Influence", "Leader", "Place", "Position",
               "Prestige", "Rank", "Recognition", "Respect", "Role", "Status")

#Column headers: the term label followed by realized/aspirational counts per leader
tab1_headers <- c("Status Term",
                  "Nazarbayev realized", "Nazarbayev aspirational",
                  "Orbán realized", "Orbán aspirational",
                  "Zuma realized", "Zuma aspirational")

#First column holds the term labels; the six count columns start at zero
tab1 <- data.frame(term_vars, matrix(0, nrow = 13, ncol = 6), stringsAsFactors = FALSE)
colnames(tab1) <- tab1_headers


#Count, for each status term, how many excerpts carry that term and a given progress label.
#  excerpts: data frame of coded excerpts with columns Root_KW and Progress
#  terms:    character vector of status terms to count
#  progress: the Progress value to match ("Realized" or "Aspirational")
#Returns a numeric vector of counts, one per term and in the same order as terms.
#Rows where the comparison is NA are not counted, matching subset()'s handling of NA.
count_status_terms <- function(excerpts, terms, progress) {
  stopifnot(all(c("Root_KW", "Progress") %in% names(excerpts)))
  vapply(
    terms,
    function(term) {
      is_match <- excerpts[["Root_KW"]] == term & excerpts[["Progress"]] == progress
      as.numeric(sum(is_match, na.rm = TRUE))
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

#Each count depends only on the term and the progress label, so one pass over the
#13 terms per column is enough (no per-row loop is needed)

#Tabulate the status term usage for Nazarbayev
tab1[, 2] <- count_status_terms(qca_naz_compact, term_vars, "Realized")
tab1[, 3] <- count_status_terms(qca_naz_compact, term_vars, "Aspirational")

#Tabulate the status term usage for Orbán
tab1[, 4] <- count_status_terms(qca_orban_compact, term_vars, "Realized")
tab1[, 5] <- count_status_terms(qca_orban_compact, term_vars, "Aspirational")

#Tabulate the status term usage for Zuma
tab1[, 6] <- count_status_terms(qca_zuma_compact, term_vars, "Realized")
tab1[, 7] <- count_status_terms(qca_zuma_compact, term_vars, "Aspirational")


##Write Table 1 out for inclusion in a Word document
#Use Times New Roman, size 12, centred text as the flextable defaults
set_flextable_defaults(font.size = 12, font.family = "Times New Roman", hansi.family = "Times New Roman", text.align = "center")

#Bold caption shown with the table
tab1_caption <- as_paragraph(as_b("Table 1: Breakdown of Usage of Specific International Status Markers"))

#Build the flextable from tab1 and attach the caption
flextab1 <- set_caption(flextable(data = tab1), caption = tab1_caption, align_with_table = TRUE)

#Print the table to confirm it looks correct before exporting
flextab1

#Save the table as a Word document
save_as_docx(flextab1, path = "./PolComm_Populist_Authoritarian_Leaders_Table1.docx", align = "center")


####
##Reproducing Figure 1 from the article
#Build one long data frame holding the yearly status-reference count for all three leaders
year_vars <- c(1997:2024)

#Count how many excerpts fall in each year.
#  excerpts: data frame of coded excerpts with a numeric Year column
#  years:    vector of years to count
#Returns a numeric vector of counts, one per year (0 for years with no excerpts).
count_by_year <- function(excerpts, years) {
  stopifnot("Year" %in% names(excerpts))
  vapply(
    years,
    function(yr) as.numeric(sum(excerpts[["Year"]] == yr, na.rm = TRUE)),
    numeric(1)
  )
}

#Yearly status marker references for N. Nazarbayev
qca_naz_long <- data.frame("Year" = year_vars, "Leader" = "Nazarbayev",
                           "Count" = count_by_year(qca_naz_compact, year_vars))

#Yearly status marker references for V. Orbán
qca_orban_long <- data.frame("Year" = year_vars, "Leader" = "Orbán",
                             "Count" = count_by_year(qca_orban_compact, year_vars))

#Yearly status marker references for J. Zuma
qca_zuma_long <- data.frame("Year" = year_vars, "Leader" = "Zuma",
                            "Count" = count_by_year(qca_zuma_compact, year_vars))

#Use the rbind function to compile into one large data frame
qca_combined <- rbind(qca_naz_long, qca_orban_long, qca_zuma_long)

#Check that everything looks correct
#View() opens a data viewer, so it is only called when R is running interactively
if (interactive()) {
  View(qca_naz_long)
  View(qca_orban_long)
  View(qca_zuma_long)
  View(qca_combined)
}


##
#Create a figure: dodged bars of yearly status references, one bar colour per leader
fig1 <- ggplot(qca_combined, aes(x = Year, y = Count, fill = Leader)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(x = "Year", y = "Status References",
       title = bquote(bold("Figure 1: Leaders' International Status Term Usage by Year"))) +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        legend.position = "bottom",
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        #Alternative x-axis labels: element_text(angle = 90, hjust = 0, face = "bold")
        axis.text.x = element_text(vjust = 1, hjust = 1)) +
  scale_fill_manual("Leader", values = c("#336699", "darkorange2", "darkgreen")) +
  scale_x_continuous(breaks = year_vars) +
  scale_y_continuous(breaks = seq(0, 30, 5))

#View the figure in R
fig1

#Output the figure to a PDF file for local storage
#The plot is passed explicitly so the saved file does not depend on which plot was drawn last.
#No dev.off() is called here: it would close whatever other graphics device is open,
#or raise an error if none is.
ggsave("PolComm_Populist_Authoritarian_Leaders_Figure1.pdf", plot = fig1,
       width = 11, height = 8.5, units = "in")


#################
##END OF SCRIPT##
#################